﻿using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO.Compression;
using System.IO.Packaging;
using System.Xml;
using System.IO;

namespace WebNet.Core
{
    /// <summary>
    /// Extracts the plain text of a WordprocessingML (.docx) document by reading
    /// its main document part through the <see cref="System.IO.Packaging"/> API.
    /// </summary>
    public class DocxToText
    {
        private const string WordprocessingMlNamespace = @"http://schemas.openxmlformats.org/wordprocessingml/2006/main";

        // Content type that identifies the main document part of a .docx package
        // (the part is usually, but not necessarily, "/word/document.xml").
        private const string MainDocumentContentType =
            "application/vnd.openxmlformats-officedocument." +
            "wordprocessingml.document.main+xml";

        private const string BodyXPath = "/w:document/w:body";

        private readonly string docxFile;

        /// <summary>
        /// Creates an extractor for the given file. The file is not opened until
        /// <see cref="ExtractText"/> is called.
        /// </summary>
        /// <param name="fileName">Path of the .docx file to read.</param>
        public DocxToText(string fileName)
        {
            docxFile = fileName;
        }

        #region ExtractText()
        /// <summary>
        /// Extracts text from the Docx file.
        /// </summary>
        /// <returns>
        /// Extracted text; an empty string when the document has no body element.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// No input file name was supplied to the constructor.
        /// </exception>
        /// <exception cref="InvalidDataException">
        /// The package contains no main WordprocessingML document part.
        /// </exception>
        public string ExtractText()
        {
            if (string.IsNullOrEmpty(docxFile))
            {
                throw new InvalidOperationException("Input file not specified.");
            }

            // Package.Open(path) alone defaults to OpenOrCreate/ReadWrite, which
            // needs write access and silently creates a file for a wrong path.
            // Open strictly read-only instead.
            using (Package package = Package.Open(docxFile, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                PackagePart documentPart = FindMainDocumentPart(package);

                if (documentPart == null)
                {
                    throw new InvalidDataException("It is not a valid Docx file.");
                }

                return ReadDocumentXml(documentPart);
            }
        }
        #endregion

        #region FindMainDocumentPart()
        /// <summary>
        /// Locates the main document part of the package.
        /// </summary>
        /// <remarks>
        /// "[Content_Types].xml" is consumed by the packaging API and is not
        /// exposed as a part, so it cannot be fetched with Package.GetPart.
        /// The content type it declares for each part is available through
        /// <see cref="PackagePart.ContentType"/>, which is what is matched here.
        /// </remarks>
        /// <param name="package">An opened package.</param>
        /// <returns>The main document part, or null when there is none.</returns>
        private static PackagePart FindMainDocumentPart(Package package)
        {
            foreach (PackagePart part in package.GetParts())
            {
                if (string.Equals(part.ContentType, MainDocumentContentType, StringComparison.OrdinalIgnoreCase))
                {
                    return part;
                }
            }

            return null;
        }
        #endregion

        #region ReadDocumentXml()
        /// <summary>
        /// Loads the main document part and collects the text of its body.
        /// </summary>
        /// <param name="documentPart">The main document part.</param>
        /// <returns>
        /// Text contained in the document body; an empty string when the
        /// "/w:document/w:body" element is absent.
        /// </returns>
        private static string ReadDocumentXml(PackagePart documentPart)
        {
            XmlDocument xmlDoc = new XmlDocument();
            xmlDoc.PreserveWhitespace = true;
            // Document parts have no legitimate need for external DTDs or
            // entities; do not let the parser fetch them.
            xmlDoc.XmlResolver = null;

            // "using" guarantees the stream is released even if Load throws.
            using (Stream documentXml = documentPart.GetStream(FileMode.Open, FileAccess.Read))
            {
                xmlDoc.Load(documentXml);
            }

            XmlNamespaceManager namespaceManager = new XmlNamespaceManager(xmlDoc.NameTable);
            namespaceManager.AddNamespace("w", WordprocessingMlNamespace);

            XmlNode body = xmlDoc.DocumentElement.SelectSingleNode(BodyXPath, namespaceManager);

            if (body == null)
            {
                return string.Empty;
            }

            StringBuilder sb = new StringBuilder();
            ReadNode(body, sb);
            return sb.ToString();
        }
        #endregion

        #region ReadNode()
        /// <summary>
        /// Appends the text of the node's element children, recursively, to
        /// <paramref name="sb"/>. Elements are matched by local name only.
        /// </summary>
        /// <param name="node">Element whose children are read; other node types are ignored.</param>
        /// <param name="sb">Builder receiving the text (shared across the whole recursion).</param>
        private static void ReadNode(XmlNode node, StringBuilder sb)
        {
            if (node == null || node.NodeType != XmlNodeType.Element)
            {
                return;
            }

            foreach (XmlNode child in node.ChildNodes)
            {
                if (child.NodeType != XmlNodeType.Element)
                {
                    continue;
                }

                switch (child.LocalName)
                {
                    case "t":                           // Text
                        string space = ((XmlElement)child).GetAttribute("xml:space");
                        if (space == "preserve")
                        {
                            // Whitespace is significant: copy it exactly so that
                            // words split across runs are neither glued together
                            // nor broken apart by invented spaces.
                            sb.Append(child.InnerText);
                        }
                        else
                        {
                            sb.Append(child.InnerText.TrimEnd());
                        }
                        break;

                    case "cr":                          // Carriage return
                    case "br":                          // Break (line, page or column)
                        sb.Append(Environment.NewLine);
                        break;

                    case "tab":                         // Tab
                        sb.Append('\t');
                        break;

                    case "p":                           // Paragraph
                        ReadNode(child, sb);
                        // Paragraphs are separated by a blank line.
                        sb.Append(Environment.NewLine);
                        sb.Append(Environment.NewLine);
                        break;

                    default:                            // Containers: runs, tables, hyperlinks, ...
                        ReadNode(child, sb);
                        break;
                }
            }
        }
        #endregion
    }
}
